#!/usr/bin/env Rscript
library(text2vec)
library(quanteda)
library(dplyr)
library(topicmodels)
library(ldatuning)
library(parallel)
library(doParallel)
library(lubridate)

# ================================
# arguments (used to facilitate HPC processing)
# ================================
# Expect exactly two trailing command-line arguments: the lower and upper
# bound of the topic-count grid (used later as the `from`/`to` of a sequence
# with step 5 and in the output file name).
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 2) {
  # Collapse the received values into a single string: pasting the vector
  # directly onto the message would repeat the message once per argument.
  stop(
    "Not the right number of arguments! Expected 2, got ", length(args),
    ": ", paste(args, collapse = " "),
    call. = FALSE
  )
}
# Reject values that cannot be read as integers instead of carrying NA forward.
if (anyNA(suppressWarnings(as.integer(args)))) {
  stop(
    "Both arguments must be integers, got: ", paste(args, collapse = " "),
    call. = FALSE
  )
}
args <- as.integer(args)
# Fail before any data is loaded: a reversed range would otherwise only error
# when the topic sequence is built.
if (args[1] > args[2]) {
  stop(
    "The first argument (", args[1], ") must not exceed the second (",
    args[2], ").",
    call. = FALSE
  )
}

# ================================
# define paths
# ================================
# Input and output directories (HPC paths).
in_path <- "/scratch/plr250/DiscursosVZ/Inputs/"
out_path <- "/scratch/plr250/DiscursosVZ/TopicModels/Outputs/"

# Read the preprocessed corpus, parse `date` from day/month/year text, sort
# chronologically, derive the calendar year and keep documents from 1998 on.
corpus <- readRDS(
  paste0(in_path, "chavez_discourse_preprocessed_colloc.rds")
) %>%
  mutate(date = as.Date(date, "%d/%m/%Y")) %>%
  arrange(date) %>%
  mutate(year = lubridate::year(date)) %>%
  filter(year >= 1998)

#==================================
# create and prune vocab
#==================================
# stopwords
# Merge the Spanish stopword lists from both sources. The two vectors must be
# combined with c() first: the second positional argument of unique() is
# `incomparables`, so passing the snowball list there never added its words.
stopwords_es <- unique(c(
  stopwords::data_stopwords_stopwordsiso$es,
  stopwords::data_stopwords_snowball$es
))
# Replace accented letters with their unaccented equivalents. chartr() is
# vectorised, so no lapply()/unlist() round trip is needed.
stopwords_es <- chartr(
  "ãâàèìòùáéíóöúüûñÀÈÌÒÙÁÉÍÓÚÑ", "aaaeiouaeioouuunAEIOUAEIOUN", stopwords_es
)
stopwords_es <- gsub("[^a-zA-Z]", "", stopwords_es)  # keep only text
# Drop entries emptied by the cleaning and duplicates it may have created.
stopwords_es <- unique(stopwords_es[stopwords_es != ""])

# select vocab
# Split every document on whitespace and wrap the result in a text2vec
# iterator (progress bar disabled).
tokens <- text2vec::space_tokenizer(corpus$text)
it <- text2vec::itoken(tokens, progressbar = FALSE)

# Build the vocabulary without the stopwords, then keep only terms that
# occur at least 100 times.
vocab <- text2vec::create_vocabulary(it, stopwords = stopwords_es) %>%
  text2vec::prune_vocabulary(term_count_min = 100)

# load nouns & collocations
# Read the noun table and the collocation table from the input directory.
nouns <- readRDS(paste0(in_path, "ch_nouns_0.5.rds"))
collocations <- readRDS(paste0(in_path, "collocations.rds"))

# Allowed terms: noun tokens, collocation replacements and two terms that are
# always kept. Note that `nouns` is turned from a table into a character vector.
nouns <- c(
  nouns$token,
  collocations$replacement,
  "constituyente",
  "constituyentes"
)

# Restrict the vocabulary to the allowed terms.
vocab <- dplyr::filter(vocab, term %in% nouns)

# dfm
# Build the document-feature matrix from a tokens object: passing raw text
# straight to dfm() is deprecated in quanteda 3 and removed in quanteda 4
# (earlier versions tokenised the text internally before counting).
# `quanteda::` is spelled out because `tokens` is also a variable in this script.
dfm_corpus <- dfm(quanteda::tokens(corpus$text))
dfm_corpus <- dfm_select(dfm_corpus, vocab$term)  # keep only vocabulary features
# Per-document token totals after subsetting (despite the name, this holds
# counts, not a flag). ntoken() works on the sparse dfm, whereas as.matrix()
# would materialise a dense documents-by-features matrix.
empty_docs <- ntoken(dfm_corpus)
dfm_corpus <- dfm_subset(dfm_corpus, empty_docs > 0)  # drop empty documents

# optimal number of topics
# Evaluate candidate topic counts from args[1] to args[2] in steps of 5,
# fitting with the Gibbs method and scoring with four selection metrics.
# NOTE(review): detectCores() reports the cores of the machine, which may be
# more than the scheduler allocated to this job -- confirm for the HPC setup.
k_optimize_tm <- FindTopicsNumber(
  dfm_corpus,
  method = "Gibbs",
  topics = seq(args[1], args[2], by = 5),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  control = list(verbose = 25L, seed = 123, burnin = 100, iter = 200),
  mc.cores = parallel::detectCores(),  # use all cores available
  verbose = TRUE
)

# Store the result; the file name records the topic range that was covered.
k_optimize_tm %>%
  saveRDS(
    file = paste0(out_path, "k_optimize_tm_", args[1], "_", args[2], ".rds")
  )

